Loading and tidying the Yelp business dataset
business <- read_csv("./data/business.csv") %>%
mutate(name = str_sub(name, 2, -2)) %>%
mutate(address = str_sub(address, 2, -2)) %>%
filter(neighborhood != "Downtown Tampa" & neighborhood != "North Valley")
## Parsed with column specification:
## cols(
## business_id = col_character(),
## name = col_character(),
## neighborhood = col_character(),
## address = col_character(),
## city = col_character(),
## state = col_character(),
## postal_code = col_integer(),
## latitude = col_double(),
## longitude = col_double(),
## stars = col_double(),
## review_count = col_integer(),
## is_open = col_integer(),
## categories = col_character()
## )
categories <- business %>%
select(business_id, categories) %>%
separate(categories, into = c("cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7", "cat8", "cat9", "cat10", "cat11", "cat12", "cat13", "cat14", "cat15", "cat16", "cat17", "cat18", "cat19", "cat20", "cat21", "cat22", "cat23", "cat24", "cat25"), sep = ";") %>%
gather(key = "cat_id", value = "category", starts_with("cat")) %>%
select(-cat_id)
## Warning: Expected 25 pieces. Additional pieces discarded in 1 rows [5602].
## Warning: Expected 25 pieces. Missing pieces filled with `NA` in 21884
## rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
## 20, ...].
restaurant_ids <- categories %>%
filter(category == "Restaurants") %>%
distinct(business_id)
restaurants <- business %>%
left_join(restaurant_ids, by = "business_id") %>%
select(-categories)
rm(business)
Loading and tidying the attributes dataset
attributes <- read_csv("./data/attributes.csv") %>%
janitor::clean_names() %>%
select(business_id, alcohol)
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
Joining the business and attributes datasets
restaurants <- restaurants %>%
left_join(attributes, by = "business_id")
Creating a plotly of restaurant locations
Center of Las Vegas: 36.1699° N, 115.1398° W. Plotly displayed is surrounds this center by 0.5 degrees longitude and latitude.
restaurants %>%
filter(latitude > 35.6699 & latitude < 36.6699) %>%
filter(longitude < -114.6398 & longitude > -115.6398) %>%
plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
alpha = 0.5,
color = ~stars, hoverinfo = 'text',
text = ~paste(name, " @", neighborhood, "\n", address, "\n", city, ", ", state, postal_code, "\n", stars, "stars on Yelp")) %>%
layout(xaxis = list(title = "Longitude"),
yaxis = list(title = "Latitude"))
<<<<<<< HEAD
=======
>>>>>>> b3016a79d0174192eca54e532ba8f779bfb4cbc4
exploratory stuff
restaurants %>%
group_by(neighborhood) %>%
<<<<<<< HEAD
count()
## # A tibble: 16 x 2
## # Groups: neighborhood [16]
## neighborhood n
## <chr> <int>
## 1 Anthem 75
## 2 Centennial 815
## 3 Chinatown 834
## 4 Downtown 1837
## 5 Eastside 1886
## 6 Northwest 1044
## 7 South Summerlin 351
## 8 Southeast 2999
## 9 Southwest 1083
## 10 Spring Valley 2710
## 11 Summerlin 896
## 12 Sunrise 704
## 13 The Lakes 162
## 14 The Strip 2734
## 15 University 261
## 16 Westside 3494
=======
count() %>%
knitr::kable()
| Anthem |
75 |
| Centennial |
815 |
| Chinatown |
834 |
| Downtown |
1837 |
| Eastside |
1886 |
| Northwest |
1044 |
| South Summerlin |
351 |
| Southeast |
2999 |
| Southwest |
1083 |
| Spring Valley |
2710 |
| Summerlin |
896 |
| Sunrise |
704 |
| The Lakes |
162 |
| The Strip |
2734 |
| University |
261 |
| Westside |
3494 |
>>>>>>> b3016a79d0174192eca54e532ba8f779bfb4cbc4
restaurants %>%
filter(alcohol == "True" | is_open == "True") %>%
group_by(neighborhood) %>%
count() %>%
knitr::kable()
| Anthem |
3 |
| Centennial |
46 |
| Chinatown |
23 |
| Downtown |
99 |
| Eastside |
66 |
| Northwest |
41 |
| South Summerlin |
45 |
| Southeast |
171 |
| Southwest |
41 |
| Spring Valley |
85 |
| Summerlin |
39 |
| Sunrise |
30 |
| The Lakes |
4 |
| The Strip |
67 |
| University |
12 |
| Westside |
162 |
<<<<<<< HEAD
restaurants %>%
=======
restaurants %>%
>>>>>>> b3016a79d0174192eca54e532ba8f779bfb4cbc4
filter(alcohol == "False" | is_open == "True") %>%
group_by(neighborhood) %>%
count() %>%
knitr::kable()
| Anthem |
2 |
| Centennial |
14 |
| Chinatown |
16 |
| Downtown |
44 |
| Eastside |
32 |
| Northwest |
15 |
| South Summerlin |
9 |
| Southeast |
87 |
| Southwest |
21 |
| Spring Valley |
35 |
| Summerlin |
11 |
| Sunrise |
15 |
| The Lakes |
3 |
| The Strip |
168 |
| University |
2 |
| Westside |
74 |
restaurants %>%
select(business_id, neighborhood, stars) %>%
distinct() %>%
group_by(neighborhood, stars) %>%
tally %>%
rename(my_count = n) %>%
spread(key = stars, value = my_count) %>%
knitr::kable()
| Anthem |
NA |
1 |
8 |
4 |
9 |
10 |
10 |
16 |
17 |
| Centennial |
14 |
18 |
47 |
77 |
113 |
138 |
142 |
101 |
165 |
| Chinatown |
24 |
11 |
40 |
60 |
94 |
168 |
184 |
141 |
112 |
| Downtown |
44 |
53 |
91 |
126 |
214 |
284 |
351 |
309 |
365 |
| Eastside |
64 |
64 |
139 |
198 |
268 |
315 |
367 |
229 |
242 |
| Northwest |
21 |
17 |
61 |
90 |
113 |
183 |
195 |
132 |
232 |
| South Summerlin |
1 |
2 |
10 |
22 |
44 |
84 |
75 |
54 |
59 |
| Southeast |
78 |
83 |
165 |
256 |
377 |
474 |
543 |
431 |
592 |
| Southwest |
13 |
22 |
49 |
89 |
114 |
166 |
204 |
185 |
241 |
| Spring Valley |
41 |
46 |
110 |
193 |
270 |
392 |
460 |
478 |
720 |
| Summerlin |
17 |
11 |
43 |
62 |
106 |
128 |
157 |
127 |
245 |
| Sunrise |
28 |
33 |
64 |
90 |
89 |
112 |
109 |
86 |
93 |
| The Lakes |
1 |
3 |
9 |
10 |
12 |
20 |
36 |
23 |
48 |
| The Strip |
19 |
37 |
117 |
277 |
448 |
596 |
629 |
383 |
228 |
| University |
5 |
10 |
15 |
24 |
30 |
57 |
48 |
45 |
27 |
| Westside |
60 |
81 |
156 |
278 |
369 |
542 |
624 |
550 |
834 |
restaurants %>%
mutate(review_count = as.numeric(review_count)) %>%
select(business_id, neighborhood, review_count) %>%
group_by(neighborhood) %>%
summarise(Average_Number_of_Reviews = mean(review_count)) %>%
knitr::kable()
| Anthem |
42.76000 |
| Centennial |
42.26012 |
| Chinatown |
91.99400 |
| Downtown |
55.80512 |
| Eastside |
55.82927 |
| Northwest |
31.30843 |
| South Summerlin |
72.31909 |
| Southeast |
46.21407 |
| Southwest |
57.39797 |
| Spring Valley |
47.95314 |
| Summerlin |
34.29018 |
| Sunrise |
16.83097 |
| The Lakes |
35.55556 |
| The Strip |
182.80102 |
| University |
56.02682 |
| Westside |
40.56898 |
<<<<<<< HEAD
=======
>>>>>>> b3016a79d0174192eca54e532ba8f779bfb4cbc4
Creating a plotly of open and closed restaurant compared to their rating on Yelp
restaurants %>%
mutate(stars = if_else(stars == 1, "1",
if_else(stars == 1.5, "1.5",
if_else(stars == 2, "2",
if_else(stars == 2.5, "2.5",
if_else(stars == 3, "3",
if_else(stars == 4, "4",
if_else(stars == 4.5, "4.5", "5"))))))),
review_count = as.numeric(review_count)) %>%
group_by(stars) %>%
plot_ly(x = ~stars, y = ~review_count, color = ~stars, type = "bar", colors = "Set3") %>%
layout(xaxis = list(title = "Stars"),
yaxis = list(title = "Number of Reviews"))
Plots of Restaurants
popular <- categories %>%
filter(category == "Restaurants" | category == "Food") %>%
distinct(business_id) %>%
left_join(categories, by = "business_id") %>%
filter(category %in% c("Bars", "Breakfast & Brunch", "Chinese", "Italian", "Mexican", "Chicken Wings", "Salad", "Sushi Bars", "Pizza", "Steakhouses"))
restaurants %>%
select(business_id, neighborhood) %>%
inner_join(popular) %>%
distinct() %>%
group_by(neighborhood, category) %>%
tally() %>%
plotly::plot_ly(x = ~neighborhood, y = ~n, type = 'bar', color = ~category, hoverinfo = 'text',
text = ~paste(neighborhood, " has ",
n, " ", category, " restaurants.")) %>%
layout(yaxis = list(title = "Restaurants"), xaxis = list(title = "", tickangle = -45), barmode = 'stack')
## Joining, by = "business_id"
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
<<<<<<< HEAD
i guess another plot? we shall see
Plots of Restaurants
popular <- categories %>%
filter(category == "Restaurants" | category == "Food") %>%
distinct(business_id) %>%
left_join(categories, by = "business_id") %>%
filter(category %in% c("Bars", "Breakfast & Brunch", "Chinese", "Italian", "Mexican", "Chicken Wings", "Salad", "Sushi Bars", "Pizza", "Steakhouses"))
restaurants %>%
select(business_id, neighborhood) %>%
inner_join(popular) %>%
distinct() %>%
group_by(neighborhood, category) %>%
tally() %>%
plotly::plot_ly(x = ~neighborhood, y = ~n, type = 'bar', color = ~category, hoverinfo = 'text',
text = ~paste(neighborhood, " has ",
n, " ", category, " restaurants.")) %>%
layout(yaxis = list(title = "Restaurants"), xaxis = list(title = "", tickangle = -45), barmode = 'stack')
## Joining, by = "business_id"
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Geographic Plot by Categories
restaurants %>%
inner_join(popular) %>%
plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
alpha = 0.9,
color = ~category, hoverinfo = 'text',
text = ~paste(name, " @", neighborhood, "\n", address, "\n", city, ", ", state, postal_code, "\n", stars, "star", category, "on Yelp.")) %>%
layout(xaxis = list(title = "Longitude"),
yaxis = list(title = "Latitude"))
## Joining, by = "business_id"
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
=======
>>>>>>> b3016a79d0174192eca54e532ba8f779bfb4cbc4